To Do

from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json
# Unpack the data archive into the parent directory

import zipfile

# The context manager closes the archive once extraction has finished.
with zipfile.ZipFile('../../data.zip', mode='r') as archive:
    archive.extractall(path='..')
import os

# Read the API key from the OPENAI_API_KEY environment variable so the secret
# does not have to be written into this file. When the variable is not set,
# fall back to the '...' placeholder (replace it with your own key).
openai_api_key = os.environ.get('OPENAI_API_KEY', '...')
# Temp = 0 so that we get clean information without a lot of creativity
chat_model = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=1000)
# Describe each field the model must return. The output parser turns these
# schemas into formatting instructions that are later injected into the prompt.
input_industry_schema = ResponseSchema(
    name="input_industry",
    description="This is the input_industry from the user",
)
standardized_industry_schema = ResponseSchema(
    name="standardized_industry",
    description="This is the industry you feel is most closely matched to the users input",
)
match_score_schema = ResponseSchema(
    name="match_score",
    description="A score 0-100 of how close you think the match is between user input and your match",
)
response_schemas = [
    input_industry_schema,
    standardized_industry_schema,
    match_score_schema,
]

# Build the parser from the schemas above
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
# Show the formatting instructions that will be embedded in the prompt
format_instructions = output_parser.get_format_instructions()
print(format_instructions)
The output should be a markdown code snippet formatted in the following schema:

```json
{
	"input_industry": string  // This is the input_industry from the user
	"standarized_industry": string  // This is the industry you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```
# Prompt text. {format_instructions} is pre-filled below as a partial variable;
# {user_industries} and {standardized_industries} are supplied at format time.
template = """
You will be given a series of industry names from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with closed and open brackets (a list of json objects)

input_industry INPUT:
{user_industries}

STANDARDIZED INDUSTRIES:
{standardized_industries}

YOUR RESPONSE:
"""

# The whole prompt is sent as a single human message.
human_message_template = HumanMessagePromptTemplate.from_template(template)

prompt = ChatPromptTemplate(
    messages=[human_message_template],
    input_variables=["user_industries", "standardized_industries"],
    partial_variables={"format_instructions": format_instructions},
)
# Load the standardized names and flatten the 'Industry' column into one
# comma-separated string. You can swap this out with whatever list you want!
df = pd.read_csv('../data/LinkedInIndustries.csv')
industry_names = df['Industry'].values
standardized_industries = ", ".join(industry_names)
standardized_industries
'Corporate Services, Recreation & Travel, Legal, Wellness & Fitness, Entertainment, Consumer Goods, Design, Arts, Manufacturing, Finance, Health Care, Construction, Nonprofit, Real Estate, Software & IT Services, Hardware & Networking, Agriculture, Education, Public Administration, Transportation & Logistics, Public Safety, Media & Communications, Energy & Mining, Retail'
# Free-form industry names to be matched, given as one comma-separated string
user_input = "air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency"

# Fill in the two remaining prompt variables
_input = prompt.format_prompt(
    standardized_industries=standardized_industries,
    user_industries=user_input,
)

# Inspect the formatted prompt: message count, type and text of the first message
message_count = len(_input.messages)
first_message = _input.messages[0]
print(f"There are {message_count} message(s)")
print(f"Type: {type(first_message)}")
print("---------------------------")
print(first_message.content)
There are 1 message(s)
Type: <class 'langchain.schema.HumanMessage'>
---------------------------

You will be given a series of industry names from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

The output should be a markdown code snippet formatted in the following schema:

```json
{
	"input_industry": string  // This is the input_industry from the user
	"standarized_industry": string  // This is the industry you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```

Wrap your final output with closed and open brackets (a list of json objects)

input_industry INPUT:
air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency

STANDARDIZED INDUSTRIES:
Corporate Services, Recreation & Travel, Legal, Wellness & Fitness, Entertainment, Consumer Goods, Design, Arts, Manufacturing, Finance, Health Care, Construction, Nonprofit, Real Estate, Software & IT Services, Hardware & Networking, Agriculture, Education, Public Administration, Transportation & Logistics, Public Safety, Media & Communications, Energy & Mining, Retail

YOUR RESPONSE:
# Send the formatted messages to the chat model, then show the reply's type and text
chat_messages = _input.to_messages()
output = chat_model(chat_messages)
print(type(output))
print(output.content)
<class 'langchain.schema.AIMessage'>


[
	{
		"input_industry": "air LineZ",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "80"
	},
	{
		"input_industry": "airline",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "aviation",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standarized_industry": "Agriculture",
		"match_score": "90"
	},
	{
		"input_industry": "bread",
		"standarized_industry": "Consumer Goods",
		"match_score": "80"
	},
	{
		"input_industry": "wifi networks",
		"standarized_industry": "Hardware & Networking",
		"match_score": "95"
	},
	{
		"input_industry": "twitter media agency",
		"standarized_industry": "Media & Communications",
		"match_score": "90"
	}
]
# Pull the JSON payload out of the model reply. The reply may or may not be
# wrapped in a ```json ... ``` markdown fence.
raw_reply = output.content
if "```json" in raw_reply:
    # Keep only the text between the opening ```json fence and the next
    # closing fence, so the trailing ``` does not end up in the JSON string.
    json_string = raw_reply.split("```json", 1)[1].split("```", 1)[0].strip()
else:
    json_string = raw_reply
# Show the extracted payload (identical to the raw reply when no fence was present)
print(json_string)
[
	{
		"input_industry": "air LineZ",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "80"
	},
	{
		"input_industry": "airline",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "aviation",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standarized_industry": "Agriculture",
		"match_score": "90"
	},
	{
		"input_industry": "bread",
		"standarized_industry": "Consumer Goods",
		"match_score": "80"
	},
	{
		"input_industry": "wifi networks",
		"standarized_industry": "Hardware & Networking",
		"match_score": "95"
	},
	{
		"input_industry": "twitter media agency",
		"standarized_industry": "Media & Communications",
		"match_score": "90"
	}
]
# output_parser.parse(output.content) Ideally this works but not in all cases
# Parse the reply ourselves instead. If the model wrapped its answer in a
# ```json ... ``` markdown fence, strip the fence first; json.loads would
# otherwise fail on the fence markers.
reply_text = output.content
if "```json" in reply_text:
    reply_text = reply_text.split("```json", 1)[1].split("```", 1)[0]
structured_data = json.loads(reply_text.strip())
structured_data
[{'input_industry': 'air LineZ',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '80'},
 {'input_industry': 'airline',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '90'},
 {'input_industry': 'aviation',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '95'},
 {'input_industry': 'planes that fly',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '85'},
 {'input_industry': 'farming',
  'standarized_industry': 'Agriculture',
  'match_score': '90'},
 {'input_industry': 'bread',
  'standarized_industry': 'Consumer Goods',
  'match_score': '80'},
 {'input_industry': 'wifi networks',
  'standarized_industry': 'Hardware & Networking',
  'match_score': '95'},
 {'input_industry': 'twitter media agency',
  'standarized_industry': 'Media & Communications',
  'match_score': '90'}]
pd.DataFrame(structured_data)
input_industry standarized_industry match_score
0 air LineZ Transportation & Logistics 80
1 airline Transportation & Logistics 90
2 aviation Transportation & Logistics 95
3 planes that fly Transportation & Logistics 85
4 farming Agriculture 90
5 bread Consumer Goods 80
6 wifi networks Hardware & Networking 95
7 twitter media agency Media & Communications 90

To Do

  1. Look at new incoming industries from the user

  2. Match against your database of values you’ve already mapped

  3. For existing ones, save an API call and get the result from the database

  4. For new ones, batch them together and send them to your LLM in a single call